!!- Decription of dataset Gene expression analyses in leukocytes sorted from normal breast tissues, ductal carcinomas in situ (DCIS), and HER2+ and triple negative invasive ductal carcinomas (IDC) Source name resected breast tumor Organism Homo sapiens Characteristics tissue of origin: breast (tumor) cell type: CD45+CD3+ T cell breast tumor subtype: DCIS
RNA was isolated from purified CD45+CD3+ leukocytes by cell sorting
Results in A1 !! normalization - from the slides Statistics: - MDS Plot - Density Plot - Boxplot
Install all the required packages
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
if (!requireNamespace("Biobase", quietly = TRUE))
BiocManager::install("Biobase")
if (!requireNamespace("ComplexHeatmap", quietly = TRUE))
BiocManager::install("ComplexHeatmap")
if (!requireNamespace("circlize", quietly = TRUE))
BiocManager::install("circlize")
if (!requireNamespace("gprofiler2", quietly = TRUE))
install.packages("gprofiler2")
Reference the packages
library(Biobase)
library(ComplexHeatmap)
## Loading required package: grid
## ========================================
## ComplexHeatmap version 2.2.0
## Bioconductor page: http://bioconductor.org/packages/ComplexHeatmap/
## Github page: https://github.com/jokergoo/ComplexHeatmap
## Documentation: http://jokergoo.github.io/ComplexHeatmap-reference
##
## If you use it in published research, please cite:
## Gu, Z. Complex heatmaps reveal patterns and correlations in multidimensional
## genomic data. Bioinformatics 2016.
## ========================================
library(circlize)
## ========================================
## circlize version 0.4.8
## CRAN page: https://cran.r-project.org/package=circlize
## Github page: https://github.com/jokergoo/circlize
## Documentation: http://jokergoo.github.io/circlize_book/book/
##
## If you use it in published research, please cite:
## Gu, Z. circlize implements and enhances circular visualization
## in R. Bioinformatics 2014.
## ========================================
library(gprofiler2)
Look at the most common genes of the same subfamily in our dataset
#Find the genes of interest
#Start with strongest signals
#The most common genes of the same subfamily
rownames(exp_filtered) <- exp_filtered$Feature
genes_sample <- data.frame(lapply(rownames(normalized_data), FUN = function(x){unlist(strsplit(x, "(?=[A-Za-z])(?<=[0-9])|(?=[0-9])(?<=[A-Za-z])", perl=TRUE))[c(1,2)]}))
colnames(genes_sample) <- colnames(exp_filtered)[2:42]
rownames(genes_sample) <- c("subfamily", "num")
genes_sample <- data.frame(t(genes_sample))
summarized_gene_counts <- sort(table(genes_sample$subfamily),decreasing = TRUE)
head(summarized_gene_counts)
##
## LOC LINC ZNF C SLC FAM
## 1355 632 522 477 380 300
!! LOC - family of RNA gene - not too interesting CD - CD8A CTLA-4 signaling pathways - mentioned in the results of the article multiple times
#CD8A
dcis_samples <- grep(colnames(normalized_data),
pattern="DCIS")
idc_samples <- grep(colnames(normalized_data),
pattern="IDC")
normal_samples <- grep(colnames(normalized_data),
pattern="N")
gene_of_interest <- "CD8A"
CD8A_dcis_samples <- data.matrix(normalized_data
[gene_of_interest,
dcis_samples])
colnames(CD8A_dcis_samples) <- c("dcis_samples")
CD8A_idc_samples <- data.matrix(normalized_data
[gene_of_interest,
idc_samples])
colnames(CD8A_idc_samples) <- c("idc_samples")
CD8A_normal_samples <- data.matrix(normalized_data
[gene_of_interest,
normal_samples])
colnames(CD8A_normal_samples) <- c("normal_samples")
# DCIS VS Normal
t.test(x=t(CD8A_dcis_samples),y=t(CD8A_normal_samples))
##
## Welch Two Sample t-test
##
## data: t(CD8A_dcis_samples) and t(CD8A_normal_samples)
## t = -0.92393, df = 17.895, p-value = 0.3678
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -2.4529180 0.9548865
## sample estimates:
## mean of x mean of y
## 4.331695 5.080711
# IDC vs Normal
t.test(x=t(CD8A_idc_samples),y=t(CD8A_normal_samples))
##
## Welch Two Sample t-test
##
## data: t(CD8A_idc_samples) and t(CD8A_normal_samples)
## t = -3.8856, df = 26.022, p-value = 0.0006289
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -3.583839 -1.104006
## sample estimates:
## mean of x mean of y
## 2.736788 5.080711
#CTLA4
gene_of_interest <- "CTLA4"
CTLA4_dcis_samples <- data.matrix(normalized_data
[gene_of_interest,
dcis_samples])
colnames(CTLA4_dcis_samples) <- c("dcis_samples")
CTLA4_idc_samples <- data.matrix(normalized_data
[gene_of_interest,
idc_samples])
colnames(CTLA4_idc_samples) <- c("idc_samples")
CTLA4_normal_samples <- data.matrix(normalized_data
[gene_of_interest,
normal_samples])
colnames(CTLA4_normal_samples) <- c("normal_samples")
# DCIS VS Normal
t.test(x=t(CTLA4_dcis_samples),y=t(CTLA4_normal_samples))
##
## Welch Two Sample t-test
##
## data: t(CTLA4_dcis_samples) and t(CTLA4_normal_samples)
## t = 1.4857, df = 13.051, p-value = 0.1611
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.5580282 3.0188473
## sample estimates:
## mean of x mean of y
## 2.339701 1.109291
# IDC vs Normal
t.test(x=t(CTLA4_idc_samples),y=t(CTLA4_normal_samples))
##
## Welch Two Sample t-test
##
## data: t(CTLA4_idc_samples) and t(CTLA4_normal_samples)
## t = 3.5966, df = 11.811, p-value = 0.003758
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 1.475537 6.031031
## sample estimates:
## mean of x mean of y
## 4.862575 1.109291
Look at the article and get the conclusion What does P-value do? P-value = 0.05 and multiple hypothesis correction on the p-values ?
Quasi liklihood DEF - used for more complicated models and is highly recommended for bulk RNASeq experiments. (glmQLFTest)
Design the model
subtypes <- samples$subtype
model_design <- model.matrix(~ 0 + subtypes)
expressionMatrix <- as.matrix(normalized_data[,1:41])
rownames(expressionMatrix) <- rownames(normalized_data)
colnames(expressionMatrix) <- colnames(normalized_data)[1:41]
minSet <- ExpressionSet(assayData=expressionMatrix)
#Fit the model
fit <- lmFit(minSet, model_design)
#Normalize the data
dm <- as.matrix(exp_filtered[,2:42])
rownames(dm) <- exp_filtered$Feature
d <- DGEList(counts=dm, group=samples$subtype)
d <- calcNormFactors(d)
#MDS Plot
plotMDS(d, labels=NULL, pch = 1,
col= c("darkgreen","blue","red")[factor(samples$subtype)])
legend("topright",
legend=levels(factor(samples$subtype)),
pch=c(1), col= c("darkgreen","blue","red"),
title="Class",
bty = 'n', cex = 0.5)
#Estimate Dispersion - our model design.
d <- estimateDisp(d, model_design)
#Fit the model
fit <- glmQLFit(d, model_design)
#Calculate differential expression
#DCIS vs Normal
contrast_dcisVSnormal <- makeContrasts(
dcisVSnormal ="subtypesDCIS-subtypesN",
levels=model_design)
#IDC vs Normal
contrast_idcVSnormal <- makeContrasts(
dcisVSnormal ="subtypesIDC-subtypesN",
levels=model_design)
#Normal vs all
contrast_normal <- makeContrasts(
immunovsrest ="subtypesN-(subtypesDCIS +
subtypesIDC)/2",
levels=model_design)
#Get all the results
qlf.dcis_vs_n <- glmQLFTest(fit, contrast=contrast_dcisVSnormal)
tt_dcis_vs_n <- topTags(qlf.dcis_vs_n,n=nrow(d))
qlf.idc_vs_n <- glmQLFTest(fit, contrast=contrast_idcVSnormal)
tt_idc_vs_n <- topTags(qlf.idc_vs_n,n=nrow(d))
qlf.normal_vs_all <- glmQLFTest(fit, contrast=contrast_normal)
tt_normal_vs_all <- topTags(qlf.normal_vs_all,n=nrow(d))
#How many gene pass the threshold p-value < 0.05?
length(which(tt_dcis_vs_n$table$PValue<0.05))
## [1] 7130
length(which(tt_idc_vs_n$table$PValue<0.05))
## [1] 7401
length(which(tt_normal_vs_all$table$PValue<0.05))
## [1] 8066
#How many genes pass correction?
#using BH as adjusted method
length(which(tt_dcis_vs_n$table$FDR < 0.01))
## [1] 1346
length(which(tt_idc_vs_n$table$FDR < 0.01))
## [1] 1188
length(which(tt_idc_vs_n$table$FDR < 0.01))
## [1] 1188
#Heatmap
heatmap_matrix <- normalized_data[,1:ncol(normalized_data)]
rownames(heatmap_matrix) <- rownames(normalized_data)
colnames(heatmap_matrix) <- colnames(normalized_data[,1:ncol(normalized_data)])
top_hits <- rownames(tt_normal_vs_all)[which(tt_normal_vs_all$table$FDR < 0.01)]
heatmap_matrix_tophits <- t(
scale(t(heatmap_matrix[which(rownames(heatmap_matrix) %in% top_hits),])))
if(min(heatmap_matrix_tophits) == 0){
heatmap_col = colorRamp2(c( 0, max(heatmap_matrix_tophits)),
c( "white", "red"))
} else {
heatmap_col = colorRamp2(c(min(heatmap_matrix_tophits), 0, max(heatmap_matrix_tophits)), c("blue", "white", "red"))
}
current_heatmap <- Heatmap(as.matrix(heatmap_matrix_tophits),
cluster_rows = TRUE, show_row_dend = TRUE,
cluster_columns = TRUE,show_column_dend = FALSE,
col=heatmap_col,show_column_names = FALSE,
show_row_names = FALSE,show_heatmap_legend = TRUE)
n_colours <- c("darkgreen","blue","orange")
names(n_colours) <- unique(samples$subtype)
n <- HeatmapAnnotation(df=data.frame(
type = samples$subtype),
col = list(type = n_colours))
current_heatmap <- Heatmap(as.matrix(heatmap_matrix_tophits),
cluster_rows = TRUE, show_row_dend = TRUE,
cluster_columns = TRUE,show_column_dend = FALSE,
col=heatmap_col,show_column_names = FALSE,
show_row_names = FALSE,show_heatmap_legend = TRUE,
top_annotation = n)
current_heatmap
Up regualted genes
diff_exp <- merge(exp[,1:1],tt_normal_vs_all, by.x=1, by.y = 0)
diff_exp[,"rank"] <- -log(diff_exp$PValue,base =10) * sign(diff_exp$logFC)
length(which(tt_normal_vs_all$table$PValue <= 0.05
& tt_normal_vs_all$table$logFC >= 1))
## [1] 5942
up_genes <- diff_exp$x[which(diff_exp$PValue <= 0.05 & diff_exp$logFC >= 1)]
Down regulated genes
length(which(tt_normal_vs_all$table$PValue < 0.05
& tt_normal_vs_all$table$logFC <= -1))
## [1] 2123
down_genes <- diff_exp$x[which(diff_exp$PValue < 0.05 & diff_exp$logFC <= -1)]
Save all genes lists
write.table(x=up_genes,
file=file.path("up_genes.txt"),sep = "\t",
row.names = FALSE,col.names = FALSE,quote = FALSE)
write.table(x=down_genes,
file=file.path("down_genes.txt"),sep = "\t",
row.names = FALSE,col.names = FALSE,quote = FALSE)
write.table(x=data.frame(genename= diff_exp$x,F_stat= diff_exp$rank),
file=file.path("ranked_genelist.txt"),sep = "\t",
row.names = FALSE,col.names = FALSE,quote = FALSE)
up_gpro <- gost(up_genes, organism = "hsapiens", ordered_query = FALSE,
multi_query = FALSE, significant = TRUE, exclude_iea = FALSE,
measure_underrepresentation = FALSE, evcodes = FALSE,
user_threshold = 0.05, correction_method = c("fdr"),
domain_scope = c("annotated"),
custom_bg = NULL, numeric_ns = "", sources = NULL,
as_short_link = FALSE)
head(up_gpro$result)
## query significant p_value term_size query_size intersection_size
## 1 query_1 TRUE 8.599586e-05 16622 4383 3781
## 2 query_1 TRUE 8.599586e-05 16622 4383 3781
## 3 query_1 TRUE 1.950758e-04 13260 4383 3066
## 4 query_1 TRUE 1.950758e-04 4641 4383 1155
## 5 query_1 TRUE 2.033616e-04 8420 4383 2006
## 6 query_1 TRUE 3.662015e-04 14265 4383 3275
## precision recall term_id source
## 1 0.8626512 0.2274696 TF:M01240_0 TF
## 2 0.8626512 0.2274696 TF:M01240 TF
## 3 0.6995209 0.2312217 TF:M01240_1 TF
## 4 0.2635181 0.2488688 TF:M09729_1 TF
## 5 0.4576774 0.2382423 TF:M04106_1 TF
## 6 0.7472051 0.2295829 TF:M00986_0 TF
## term_name effective_domain_size
## 1 Factor: BEN; motif: CAGCGRNV; match class: 0 19887
## 2 Factor: BEN; motif: CAGCGRNV 19887
## 3 Factor: BEN; motif: CAGCGRNV; match class: 1 19887
## 4 Factor: LUMAN; motif: CYCAGCYYCY; match class: 1 19887
## 5 Factor: RUNX2; motif: NRACCGCAAACCGCAN; match class: 1 19887
## 6 Factor: Churchill; motif: CGGGNN; match class: 0 19887
## source_order parents
## 1 429 TF:M01240
## 2 428 TF:M00000
## 3 430 TF:M01240_0
## 4 4072 TF:M09729_0
## 5 6112 TF:M04106_0
## 6 726 TF:M00986
#Plot
gostplot(up_gpro, capped = TRUE, interactive = TRUE)
#View all in a chart instead
publish_gosttable(up_gpro, highlight_terms = up_gpro$result[c(1:10),],
use_colors = TRUE,
show_columns = c("source", "term_name", "term_size", "intersection_size"),
filename = NULL)
down_gpro <- gost(down_genes, organism = "hsapiens", ordered_query = FALSE,
multi_query = FALSE, significant = TRUE, exclude_iea = FALSE,
measure_underrepresentation = FALSE, evcodes = FALSE,
user_threshold = 0.05, correction_method = c("fdr"),
domain_scope = c("annotated"),
custom_bg = NULL, numeric_ns = "", sources = NULL,
as_short_link = FALSE)
head(down_gpro$result)
## query significant p_value term_size query_size intersection_size
## 1 query_1 TRUE 7.625172e-04 1121 1466 142
## 2 query_1 TRUE 2.429320e-03 1217 1466 148
## 3 query_1 TRUE 6.657958e-03 797 1466 103
## 4 query_1 TRUE 2.432283e-05 17069 1516 1430
## 5 query_1 TRUE 6.711700e-05 17004 1516 1422
## 6 query_1 TRUE 6.711700e-05 9699 1516 876
## precision recall term_id source
## 1 0.09686221 0.12667261 GO:0071345 GO:BP
## 2 0.10095498 0.12161052 GO:0034097 GO:BP
## 3 0.07025921 0.12923463 GO:0019221 GO:BP
## 4 0.94327177 0.08377761 GO:0005623 GO:CC
## 5 0.93799472 0.08362738 GO:0044464 GO:CC
## 6 0.57783641 0.09031859 GO:0044446 GO:CC
## term_name effective_domain_size source_order
## 1 cellular response to cytokine stimulus 17847 19858
## 2 response to cytokine 17847 9761
## 3 cytokine-mediated signaling pathway 17847 6545
## 4 cell 18842 300
## 5 cell part 18842 2403
## 6 intracellular organelle part 18842 2386
## parents
## 1 GO:0034097, GO:0071310
## 2 GO:0010033
## 3 GO:0007166, GO:0071345
## 4 GO:0005575
## 5 GO:0005575, GO:0005623
## 6 GO:0043229, GO:0044424, GO:0044422
#Plot
gostplot(down_gpro, capped = TRUE, interactive = TRUE)
publish_gosttable(down_gpro, highlight_terms = down_gpro$result[c(1:10),],
use_colors = TRUE,
show_columns = c("source", "term_name", "term_size", "intersection_size"),
filename = NULL)